Project Description:
Some columns hold object (string) values; we need to convert these values to numeric ones.
Questions:
# Data manipulation:
import pandas as pd
import missingno as msno
from collections import Counter
from warnings import filterwarnings
# Graphical visualization:
import seaborn as sns
import matplotlib as plt
import plotly.express as px
# Classification models:
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier,PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC,LinearSVC,NuSVC
from sklearn.neighbors import KNeighborsClassifier,NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.ensemble import VotingClassifier
# Evaluation (metrics and model selection):
from sklearn.metrics import precision_score,accuracy_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,RepeatedStratifiedKFold
# Load the heart-failure CSV into a DataFrame and print its column summary
# (non-null counts and dtypes).
data = pd.read_csv('../input/heart-failure-prediction/heart.csv')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 918 entries, 0 to 917 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 918 non-null int64 1 Sex 918 non-null object 2 ChestPainType 918 non-null object 3 RestingBP 918 non-null int64 4 Cholesterol 918 non-null int64 5 FastingBS 918 non-null int64 6 RestingECG 918 non-null object 7 MaxHR 918 non-null int64 8 ExerciseAngina 918 non-null object 9 Oldpeak 918 non-null float64 10 ST_Slope 918 non-null object 11 HeartDisease 918 non-null int64 dtypes: float64(1), int64(6), object(5) memory usage: 86.2+ KB
Note:
To describe and visualize the dataset, I used the pandas-profiling library.
pip install pandas-profiling
Requirement already satisfied: pandas-profiling in /opt/conda/lib/python3.7/site-packages (3.0.0)
Requirement already satisfied: phik>=0.11.1 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (0.11.2)
Requirement already satisfied: numpy>=1.16.0 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (1.19.5)
Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (1.3.2)
Requirement already satisfied: pydantic>=1.8.1 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (1.8.2)
Requirement already satisfied: missingno>=0.4.2 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (0.4.2)
Requirement already satisfied: visions[type_image_path]==0.7.1 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (0.7.1)
Requirement already satisfied: requests>=2.24.0 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (2.25.1)
Requirement already satisfied: htmlmin>=0.1.12 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (0.1.12)
Requirement already satisfied: scipy>=1.4.1 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (1.7.1)
Requirement already satisfied: tangled-up-in-unicode==0.1.0 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (0.1.0)
Requirement already satisfied: seaborn>=0.10.1 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (0.11.2)
Requirement already satisfied: tqdm>=4.48.2 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (4.62.1)
Requirement already satisfied: jinja2>=2.11.1 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (3.0.1)
Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (1.0.1)
Requirement already satisfied: PyYAML>=5.0.0 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (5.4.1)
Requirement already satisfied: matplotlib>=3.2.0 in /opt/conda/lib/python3.7/site-packages (from pandas-profiling) (3.4.3)
Requirement already satisfied: networkx>=2.4 in /opt/conda/lib/python3.7/site-packages (from visions[type_image_path]==0.7.1->pandas-profiling) (2.5)
Requirement already satisfied: multimethod==1.4 in /opt/conda/lib/python3.7/site-packages (from visions[type_image_path]==0.7.1->pandas-profiling) (1.4)
Requirement already satisfied: bottleneck in /opt/conda/lib/python3.7/site-packages (from visions[type_image_path]==0.7.1->pandas-profiling) (1.3.2)
Requirement already satisfied: attrs>=19.3.0 in /opt/conda/lib/python3.7/site-packages (from visions[type_image_path]==0.7.1->pandas-profiling) (21.2.0)
Requirement already satisfied: Pillow in /opt/conda/lib/python3.7/site-packages (from visions[type_image_path]==0.7.1->pandas-profiling) (8.3.1)
Requirement already satisfied: imagehash in /opt/conda/lib/python3.7/site-packages (from visions[type_image_path]==0.7.1->pandas-profiling) (4.2.1)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.7/site-packages (from jinja2>=2.11.1->pandas-profiling) (2.0.1)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.2.0->pandas-profiling) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.2.0->pandas-profiling) (1.3.1)
Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.2.0->pandas-profiling) (2.8.0)
Requirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.2.0->pandas-profiling) (2.4.7)
Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from cycler>=0.10->matplotlib>=3.2.0->pandas-profiling) (1.15.0)
Requirement already satisfied: decorator>=4.3.0 in /opt/conda/lib/python3.7/site-packages (from networkx>=2.4->visions[type_image_path]==0.7.1->pandas-profiling) (5.0.9)
Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3->pandas-profiling) (2021.1)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from pydantic>=1.8.1->pandas-profiling) (3.7.4.3)
Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests>=2.24.0->pandas-profiling) (4.0.0)
Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests>=2.24.0->pandas-profiling) (2021.5.30)
Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests>=2.24.0->pandas-profiling) (2.10)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests>=2.24.0->pandas-profiling) (1.26.6)
Requirement already satisfied: PyWavelets in /opt/conda/lib/python3.7/site-packages (from imagehash->visions[type_image_path]==0.7.1->pandas-profiling) (1.1.1)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Note: you may need to restart the kernel to use updated packages.
# Build a pandas-profiling report over the whole dataset.
from pandas_profiling import ProfileReport
profile = ProfileReport(data, title='Pandas Profiling Report to Dataset')
# Bare expression: presumably rendered as the cell output when run in a notebook.
profile
Missing Values:
# Print the column summary again to inspect the non-null count of each column.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 918 entries, 0 to 917 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 918 non-null int64 1 Sex 918 non-null object 2 ChestPainType 918 non-null object 3 RestingBP 918 non-null int64 4 Cholesterol 918 non-null int64 5 FastingBS 918 non-null int64 6 RestingECG 918 non-null object 7 MaxHR 918 non-null int64 8 ExerciseAngina 918 non-null object 9 Oldpeak 918 non-null float64 10 ST_Slope 918 non-null object 11 HeartDisease 918 non-null int64 dtypes: float64(1), int64(6), object(5) memory usage: 86.2+ KB
# Count the missing values in each column.
data.isnull().sum()
Age 0 Sex 0 ChestPainType 0 RestingBP 0 Cholesterol 0 FastingBS 0 RestingECG 0 MaxHR 0 ExerciseAngina 0 Oldpeak 0 ST_Slope 0 HeartDisease 0 dtype: int64
# Draw the missingno matrix for the dataset.
fig = msno.matrix(data, color=(0,0.6,0.8))
# Row dropping is left disabled (presumably because no column reports missing values).
#data.dropna(inplace=True)
#data.info()
Note:
Some columns hold object (string) values; we need to convert these values to numeric ones.
# Preview the first rows before the string columns are encoded.
data.head()
| Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | M | ATA | 140 | 289 | 0 | Normal | 172 | N | 0.0 | Up | 0 |
| 1 | 49 | F | NAP | 160 | 180 | 0 | Normal | 156 | N | 1.0 | Flat | 1 |
| 2 | 37 | M | ATA | 130 | 283 | 0 | ST | 98 | N | 0.0 | Up | 0 |
| 3 | 48 | F | ASY | 138 | 214 | 0 | Normal | 108 | Y | 1.5 | Flat | 1 |
| 4 | 54 | M | NAP | 150 | 195 | 0 | Normal | 122 | N | 0.0 | Up | 0 |
# Sex: swap the string labels for integer codes (M -> 1, F -> 2).
data["Sex"] = data["Sex"].map({
    "M": 1,
    "F": 2,
})
data.head()
| Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 1 | ATA | 140 | 289 | 0 | Normal | 172 | N | 0.0 | Up | 0 |
| 1 | 49 | 2 | NAP | 160 | 180 | 0 | Normal | 156 | N | 1.0 | Flat | 1 |
| 2 | 37 | 1 | ATA | 130 | 283 | 0 | ST | 98 | N | 0.0 | Up | 0 |
| 3 | 48 | 2 | ASY | 138 | 214 | 0 | Normal | 108 | Y | 1.5 | Flat | 1 |
| 4 | 54 | 1 | NAP | 150 | 195 | 0 | Normal | 122 | N | 0.0 | Up | 0 |
# ChestPainType: swap the string labels for integer codes
# (TA -> 1, ATA -> 2, NAP -> 3, ASY -> 4).
data["ChestPainType"] = data["ChestPainType"].map({
    "TA": 1,
    "ATA": 2,
    "NAP": 3,
    "ASY": 4,
})
data.head()
| Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 1 | 2 | 140 | 289 | 0 | Normal | 172 | N | 0.0 | Up | 0 |
| 1 | 49 | 2 | 3 | 160 | 180 | 0 | Normal | 156 | N | 1.0 | Flat | 1 |
| 2 | 37 | 1 | 2 | 130 | 283 | 0 | ST | 98 | N | 0.0 | Up | 0 |
| 3 | 48 | 2 | 4 | 138 | 214 | 0 | Normal | 108 | Y | 1.5 | Flat | 1 |
| 4 | 54 | 1 | 3 | 150 | 195 | 0 | Normal | 122 | N | 0.0 | Up | 0 |
# RestingECG: swap the string labels for integer codes
# (Normal -> 1, ST -> 2, LVH -> 3).
data["RestingECG"] = data["RestingECG"].map({
    "Normal": 1,
    "ST": 2,
    "LVH": 3,
})
data.head()
| Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 1 | 2 | 140 | 289 | 0 | 1 | 172 | N | 0.0 | Up | 0 |
| 1 | 49 | 2 | 3 | 160 | 180 | 0 | 1 | 156 | N | 1.0 | Flat | 1 |
| 2 | 37 | 1 | 2 | 130 | 283 | 0 | 2 | 98 | N | 0.0 | Up | 0 |
| 3 | 48 | 2 | 4 | 138 | 214 | 0 | 1 | 108 | Y | 1.5 | Flat | 1 |
| 4 | 54 | 1 | 3 | 150 | 195 | 0 | 1 | 122 | N | 0.0 | Up | 0 |
# ExerciseAngina: swap the string labels for integer codes (Y -> 1, N -> 2).
data["ExerciseAngina"] = data["ExerciseAngina"].map({
    "Y": 1,
    "N": 2,
})
data.head()
| Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 1 | 2 | 140 | 289 | 0 | 1 | 172 | 2 | 0.0 | Up | 0 |
| 1 | 49 | 2 | 3 | 160 | 180 | 0 | 1 | 156 | 2 | 1.0 | Flat | 1 |
| 2 | 37 | 1 | 2 | 130 | 283 | 0 | 2 | 98 | 2 | 0.0 | Up | 0 |
| 3 | 48 | 2 | 4 | 138 | 214 | 0 | 1 | 108 | 1 | 1.5 | Flat | 1 |
| 4 | 54 | 1 | 3 | 150 | 195 | 0 | 1 | 122 | 2 | 0.0 | Up | 0 |
# ST_Slope: swap the string labels for integer codes
# (Up -> 1, Flat -> 2, Down -> 3).
data["ST_Slope"] = data["ST_Slope"].map({
    "Up": 1,
    "Flat": 2,
    "Down": 3,
})
data.head()
| Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 1 | 2 | 140 | 289 | 0 | 1 | 172 | 2 | 0.0 | 1 | 0 |
| 1 | 49 | 2 | 3 | 160 | 180 | 0 | 1 | 156 | 2 | 1.0 | 2 | 1 |
| 2 | 37 | 1 | 2 | 130 | 283 | 0 | 2 | 98 | 2 | 0.0 | 1 | 0 |
| 3 | 48 | 2 | 4 | 138 | 214 | 0 | 1 | 108 | 1 | 1.5 | 2 | 1 |
| 4 | 54 | 1 | 3 | 150 | 195 | 0 | 1 | 122 | 2 | 0.0 | 1 | 0 |
# Print the column summary again to check the dtypes after the encoding steps.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 918 entries, 0 to 917 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 918 non-null int64 1 Sex 918 non-null int64 2 ChestPainType 918 non-null int64 3 RestingBP 918 non-null int64 4 Cholesterol 918 non-null int64 5 FastingBS 918 non-null int64 6 RestingECG 918 non-null int64 7 MaxHR 918 non-null int64 8 ExerciseAngina 918 non-null int64 9 Oldpeak 918 non-null float64 10 ST_Slope 918 non-null int64 11 HeartDisease 918 non-null int64 dtypes: float64(1), int64(11) memory usage: 86.2 KB
# Feature matrix: every column except the HeartDisease target.
X = data.drop(columns=["HeartDisease"])
X.head()
| Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 1 | 2 | 140 | 289 | 0 | 1 | 172 | 2 | 0.0 | 1 |
| 1 | 49 | 2 | 3 | 160 | 180 | 0 | 1 | 156 | 2 | 1.0 | 2 |
| 2 | 37 | 1 | 2 | 130 | 283 | 0 | 2 | 98 | 2 | 0.0 | 1 |
| 3 | 48 | 2 | 4 | 138 | 214 | 0 | 1 | 108 | 1 | 1.5 | 2 |
| 4 | 54 | 1 | 3 | 150 | 195 | 0 | 1 | 122 | 2 | 0.0 | 1 |
# Target vector: the HeartDisease label column.
y = data.loc[:, "HeartDisease"]
y.head()
0 0 1 1 2 0 3 1 4 0 Name: HeartDisease, dtype: int64
# Hold out 20% of the rows as a test set; the fixed random_state pins the split.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)
# Sizes of the test and training partitions, in that order.
len(x_test), len(x_train)
(184, 734)
# Baseline comparison: fit a range of untuned scikit-learn classifiers on the
# training split and rank them by macro-averaged precision on the test split.

# Suppress all warnings from this point on.
filterwarnings('ignore')

# (label, estimator) pairs; the label is the name reported in the ranking.
models = [
    ("LR", LogisticRegression(max_iter=1000)),
    ("SVC", SVC()),
    ("KNC", KNeighborsClassifier(n_neighbors=10)),
    ("DTC", DecisionTreeClassifier()),
    ("GNB", GaussianNB()),
    ("SGDC", SGDClassifier()),
    ("Perc", Perceptron()),
    ("NC", NearestCentroid()),
    ("Ridge", RidgeClassifier()),
    ("NuSVC", NuSVC()),
    ("BNB", BernoulliNB()),
    ("RF", RandomForestClassifier()),
    ("ADA", AdaBoostClassifier()),
    # Labelled "XGB", but this is scikit-learn's GradientBoostingClassifier.
    ("XGB", GradientBoostingClassifier()),
    ("PAC", PassiveAggressiveClassifier()),
]

results = []       # scores, in the same order as `models`
names = []         # labels, in the same order as `models`
finalresults = []  # (label, score) pairs, sorted best-first after the loop

for name, model in models:
    model.fit(x_train, y_train)
    model_results = model.predict(x_test)
    score = precision_score(y_test, model_results, average='macro')
    results.append(score)
    names.append(name)
    finalresults.append((name, score))

# Highest macro precision first.
finalresults.sort(key=lambda k: k[1], reverse=True)
finalresults
[('ADA', 0.8383966244725738),
('XGB', 0.8338206627680311),
('RF', 0.8314221612863019),
('GNB', 0.8272098294166765),
('NuSVC', 0.8216346153846154),
('LR', 0.8185240034555104),
('Ridge', 0.8174311926605504),
('BNB', 0.7720725910989485),
('DTC', 0.759338061465721),
('PAC', 0.7357954545454546),
('SGDC', 0.7254335260115607),
('SVC', 0.7183006535947712),
('KNC', 0.7019378470991374),
('Perc', 0.6901041666666667),
('NC', 0.571837931494069)]
Note:
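In the ranking shown above, the three highest macro-precision scores belong to ADA, XGB and RF (several of these models are unseeded, so the order can change between runs). The models tuned in the next step are Ridge, RF and XGB.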
# Randomized hyper-parameter search for RF, Ridge and XGB.

# Search space per model: the estimator to tune and the candidate values
# for each of its parameters.
models_params = {
    "RF": {
        'model': RandomForestClassifier(),
        'params': {
            'max_features': list(range(1, 10)),
            'n_estimators': [10, 100, 1000],
        },
    },
    'Ridge': {
        'model': RidgeClassifier(),
        'params': {
            'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        },
    },
    'XGB': {
        'model': GradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.0001, 0.001, 0.01, 0.1],
            'n_estimators': [100, 200, 500, 1000],
            'max_features': ['sqrt', 'log2'],
            # Start at 1: a depth of 0 is rejected by the estimator, so any
            # sampled candidate using it would only produce failed fits.
            'max_depth': list(range(1, 11)),
        },
    },
}

# Evaluation: stratified 5-fold cross-validation repeated 20 times,
# i.e. 100 fits per sampled candidate.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=20)

# Search: sample up to 10 candidates per model and keep the best one.
# No `scoring` is passed, so each candidate is scored with the estimator's
# own default score.
scores = []
for model_name, params in models_params.items():
    rs = RandomizedSearchCV(params['model'], params['params'], cv=cv, n_iter=10)
    rs.fit(x_train, y_train)
    scores.append([model_name, dict(rs.best_params_), rs.best_score_])

# NOTE: this rebinds `data`; from here on it is the results table, not the
# dataset DataFrame.
data = pd.DataFrame(scores, columns=['Model', 'Parameters', 'Score'])
data
| Model | Parameters | Score | |
|---|---|---|---|
| 0 | RF | {'n_estimators': 1000, 'max_features': 4} | 0.871325 |
| 1 | Ridge | {'solver': 'auto'} | 0.854617 |
| 2 | XGB | {'n_estimators': 500, 'max_features': 'log2', ... | 0.879636 |
Note: